import scrapy
from scrapy import cmdline
from scrapy.http import HtmlResponse


class Top250Spider(scrapy.Spider):
    """
    如果使用spider类默认的请求方式, 则不会对重复的请求进行过滤
        会重复请求相同的url
    """
    name = "top250"
    # allowed_domains = ["movie.douban.com", "doubanio.com"]
    # start_urls = ["https://movie.douban.com/top250", "https://movie.douban.com/top250?start=25&filter=",
    #               "https://movie.douban.com/top250"]

    # To have identical URLs deduplicated, override start_requests so the
    # requests go through the scheduler's dupefilter (dont_filter=False).
    # Overriding start_requests is also the place to construct the request
    # URLs when the target is an API endpoint; see the sketch after this method.
    def start_requests(self):
        url = 'https://movie.douban.com/top250?start={}&filter='
        for page in range(10):
            # dont_filter=False is the Request default; shown explicitly here
            yield scrapy.Request(url.format(page * 25), dont_filter=False, callback=self.parse)
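
    # A minimal sketch of the API-endpoint case mentioned above. The endpoint
    # URL, its offset/limit parameters, and the 'subjects' key are hypothetical
    # (not a real Douban API contract); response.json() needs Scrapy >= 2.1:
    # def start_requests(self):
    #     api_url = 'https://example.com/api/movies?offset={}&limit=25'
    #     for page in range(10):
    #         yield scrapy.Request(api_url.format(page * 25), callback=self.parse_api)
    #
    # def parse_api(self, response):
    #     for subject in response.json().get('subjects', []):
    #         yield {"type": "info", "title": subject.get("title")}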

    def parse(self, response: HtmlResponse, **kwargs):
        print(response.request.url)
        li_list = response.xpath("//ol[@class='grid_view']/li")
        for li_temp in li_list:
            image_url = li_temp.xpath(".//img/@src").get()
            title = li_temp.xpath(".//span[@class='title'][1]/text()").get()
            rating_num = li_temp.xpath(".//span[@class='rating_num']/text()").get()
            people_num = li_temp.xpath(".//div[@class='star']/span[4]/text()").get()

            yield {
                "type": "info",
                "image": image_url,
                "title": title,
                "rating_num": rating_num,
                "people_num": people_num
            }

            # Create a follow-up request to download the image bytes, passing
            # the title along via cb_kwargs:
            # yield scrapy.Request(url=image_url, callback=self.image_parse, cb_kwargs={'image_name': title})

        # Following the "next page" link like this suits static sites better:
        # if response.xpath("//span[@class='next']/a/@href"):
        #     next_url = response.urljoin(response.xpath("//span[@class='next']/a/@href").get())
        #     print('Next page URL:', next_url)
        #     yield scrapy.Request(url=next_url, callback=self.parse)
        # else:
        #     print('Finished crawling the whole site...')

    # def image_parse(self, response, image_name):
    #     yield {
    #         "type": 'image',
    #         "image_name": image_name + '.jpg',
    #         "image_content": response.body
    #     }
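
# A minimal sketch of a pipeline that could consume the two item types
# yielded above (hypothetical class name; it would also need to be enabled
# in settings.py under ITEM_PIPELINES to take effect):
# class Top250Pipeline:
#     def process_item(self, item, spider):
#         if item.get('type') == 'image':
#             with open(item['image_name'], 'wb') as f:
#                 f.write(item['image_content'])
#         return item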


if __name__ == '__main__':
    cmdline.execute('scrapy crawl top250'.split())
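    # Equivalent shell command from the project root; -O (Scrapy >= 2.1)
    # exports the yielded info items to a file, overwriting it on each run:
    #     scrapy crawl top250 -O top250.json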
